In [41]:
import pandas as pd
import plotly.io as pio
import sys
sys.path.append('/Users/kevinanderson/Projects/az_takehome/problem_2')
import utilities
import simulation
import importlib
In [42]:
importlib.reload(simulation)
help(simulation.Simulation)
Help on class Simulation in module simulation:

class Simulation(builtins.object)
 |  Simulation(
 |      sim_name: str,
 |      n: int = 500,
 |      n_visits: int = 10,
 |      event_rate: float = 0.05,
 |      event_rate_change_rate: float = 0.01,
 |      cv_mean: float = 100,
 |      cv_sd: float = 20,
 |      cv_noise_sd: float = 3,
 |      cv_change_over_time: float = -2,
 |      patient_id_start_idx: int = 1,
 |      relationship_coefficient: float = 0.1,
 |      seed: int = 2948
 |  )
 |
 |  Parameters
 |  ----------
 |  n : int
 |      Desired number of patients.
 |
 |  n_visits: int
 |      Desired number of visits across the study.
 |
 |  event_rate: float
 |      range: (0 - 1)
 |      Baseline event rate PER VISIT.
 |      For instance, if the event rate is 0.10, then 10% of patients will experience an event at each visit.
 |
 |  event_rate_change_rate: float
 |      range: (0-1)
 |      The rate at which the event rate changes over time.
 |      For instance, if the event rate is 0.10 and the event_rate_change_rate is 0.01, then the event rate will increase by 1% each visit.
 |
 |  cv_mean: float
 |      The mean of the continuous variable.
 |
 |  cv_sd: float
 |      The standard deviation of the continuous variable.
 |
 |  cv_change_over_time: float
 |      range: (-inf, inf)
 |      unit: CV standard deviations
 |      Set the rate of change of the continuous variable over time.
 |      The code will take this value and divide it by the number of visits to get the change at each visit.
 |      For instance, if cv_change_over_time is -2, then the continuous variable will decrease by 2 standard deviations over the course of the study.
 |
 |  cv_noise_sd: float
 |      The standard deviation of the noise added to the continuous variable at each visit.
 |      If this is set to zero, then the CV for each patient will be a perfect linear trajectory.
 |
 |  relationship_coefficient: float
 |      The strength of the relationship between the continuous variable and the event rate.
 |
 |      If this is set to zero, then the event rate is independent of the continuous variable.
 |      If this is set to a positive value, then the event rate increases as the continuous variable increases.
 |      If this is set to a negative value, then the event rate decreases as the continuous variable increases.
 |
 |      This directly maps to the beta coefficient in a logistic regression model that modulates the
 |      relationship between the continuous variable and the event rate.
 |
 |  patient_id_start_idx: int
 |      The starting index for patient IDs.
 |      This is useful if you want to simulate multiple arms and want to ensure that the patient IDs are unique across arms.
 |
 |  seed: int
 |      The random seed for reproducibility.
 |
 |  Methods defined here:
 |
 |  __init__(
 |      self,
 |      sim_name: str,
 |      n: int = 500,
 |      n_visits: int = 10,
 |      event_rate: float = 0.05,
 |      event_rate_change_rate: float = 0.01,
 |      cv_mean: float = 100,
 |      cv_sd: float = 20,
 |      cv_noise_sd: float = 3,
 |      cv_change_over_time: float = -2,
 |      patient_id_start_idx: int = 1,
 |      relationship_coefficient: float = 0.1,
 |      seed: int = 2948
 |  )
 |      Initialize self.  See help(type(self)) for accurate signature.
 |
 |  format_output(
 |      self,
 |      event_df=<class 'pandas.core.frame.DataFrame'>,
 |      visit_cv_df=<class 'pandas.core.frame.DataFrame'>,
 |      patient_ids=typing.List[int],
 |      format='long'
 |  )
 |
 |  plot_continous_variable_over_time(
 |      self,
 |      cv_plot_df: pandas.core.frame.DataFrame,
 |      event_plot_df: pandas.core.frame.DataFrame
 |  )
 |
 |  run_cox_timevary_cox_propotional_hazard(
 |      self,
 |      cv_data: pandas.core.frame.DataFrame,
 |      death_data: pandas.core.frame.DataFrame
 |  )
 |
 |  run_simulation(
 |      self,
 |      patient_ids: List[int],
 |      arm_name: str,
 |      arm_n: int,
 |      event_rate: float,
 |      event_rate_change_rate: float,
 |      cv_mean: float,
 |      cv_sd: float,
 |      cv_std_change_over_time: float,
 |      cv_noise_sd: float,
 |      relationship_coefficient: float
 |  )
 |
 |  simulate(self)
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |
 |  __dict__
 |      dictionary for instance variables
 |
 |  __weakref__
 |      list of weak references to the object

Vignette 1: Demonstrate how to modulate chages in the continuous variable over time¶

For this simulation, we set the baseline event rate to zero to isolate how modifying the cv_change_over_time changes the trajectory of the continuous variable across study visits

In [46]:
# Low Dependency Simulation 
importlib.reload(simulation)
pio.renderers.default = 'notebook'

# Instantiate simulation object and parameters
n = 500
change_sim_obj = simulation.Simulation(sim_name='change_over_time',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0,
                                    event_rate_change_rate = 0,
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 3,
                                    cv_change_over_time = -2, # CHANGE 2 STDEVs over course of study
                                    patient_id_start_idx = 1,
                                    relationship_coefficient = .1,
                                    )
# No change over time
nochange_sim_obj = simulation.Simulation(sim_name='nochange_over_time',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0,
                                    event_rate_change_rate = 0,
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 3,
                                    cv_change_over_time = 0, # CHANGE 0 STDEVs over course of study
                                    relationship_coefficient = .1,
                                    patient_id_start_idx = n+1
                                    )

# run the simulation
cv_data_change, death_data_change = change_sim_obj.simulate()
cv_data_nochange, death_data_nochange = nochange_sim_obj.simulate()

# concetaenate the dataframes for plotting
cv_plot_df = pd.concat([cv_data_change, cv_data_nochange])
death_plot_df = pd.concat([death_data_change, death_data_nochange])

# make the figure
fig = change_sim_obj.plot_continous_variable_over_time(cv_plot_df, death_plot_df)
fig.show()
/Users/kevinanderson/Projects/az_takehome/problem_2/simulation.py:271: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Vignette 2: Demonstrate how to modulate risk of death over time¶

For this simulation, the baseline event rate is set to 1% for both runs. However, the Event Rate Increase simulation models a 2% increase in the event rate over time

In [55]:
# Low Dependency Simulation 
pio.renderers.default = 'notebook'

importlib.reload(simulation)
# Instantiate simulation object and parameters
n = 500
change_sim_obj = simulation.Simulation(sim_name='Event Rate Increase',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0.01, # 1% event rate
                                    event_rate_change_rate = 0.02, # 2% increase in event rate over time
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 0,
                                    cv_change_over_time = -1,
                                    patient_id_start_idx = 1,
                                    relationship_coefficient = 0,
                                    )
# No change over time
nochange_sim_obj = simulation.Simulation(sim_name='Event Rate Constant',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0.01, # 1% event rate
                                    event_rate_change_rate = 0, # NO CHANGE in event rate over time
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 0,
                                    cv_change_over_time = -1, 
                                    relationship_coefficient = 0,
                                    patient_id_start_idx = n+1
                                    )

# run the simulation
cv_data_change, death_data_change = change_sim_obj.simulate()
cv_data_nochange, death_data_nochange = nochange_sim_obj.simulate()

# concetaenate the dataframes for plotting
cv_plot_df = pd.concat([cv_data_change, cv_data_nochange])
death_plot_df = pd.concat([death_data_change, death_data_nochange])

# make the figure
fig = change_sim_obj.plot_continous_variable_over_time(cv_plot_df, death_plot_df)

# change figure dimensions
fig.update_layout(
    height=1000,
)
pio.renderers.default = 'notebook'
fig
/Users/kevinanderson/Projects/az_takehome/problem_2/simulation.py:271: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Problem 2: Model High vs Low dependence between the continuous variable and the event (i.e. death)

  • Relationship coefficient is essentially a beta within a logistic regression function.
  • For exampke, if the relationship_coefficient=0.10, then a 1 unit increase in the CV would lead to a 0.10 increase in the log-odds of the event.
  • I also display the results of a time-varying cox proportinoal hazards model to verify that High Dependence scenario produces stronger dependence between the CV and the binary event.
In [58]:
# Low Dependency Simulation 
pio.renderers.default = 'notebook'

importlib.reload(simulation)
# Instantiate simulation object and parameters
n = 500
high_sim_obj = simulation.Simulation(sim_name='High Dependence',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0.02,
                                    event_rate_change_rate = 0.001,
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 1,
                                    cv_change_over_time = -2,
                                    patient_id_start_idx = 1,
                                    relationship_coefficient = 0.10,
                                    seed=4444,
                                    )
# No change over time
nochange_sim_obj = simulation.Simulation(sim_name='No Dependence',
                                    n = n,
                                    n_visits = 10,
                                    event_rate = 0.02,
                                    event_rate_change_rate = 0.001,
                                    cv_mean = 100,
                                    cv_sd = 10,
                                    cv_noise_sd = 1,
                                    cv_change_over_time = -2, 
                                    relationship_coefficient = 0.01,
                                    patient_id_start_idx = n+1,
                                    seed=5555,
                                    )

# run the simulation
cv_data_high, death_data_high = high_sim_obj.simulate()
cv_data_low, death_data_low = nochange_sim_obj.simulate()

# concetaenate the dataframes for plotting
cv_plot_df = pd.concat([cv_data_high, cv_data_low])
death_plot_df = pd.concat([death_data_high, death_data_low])

# make the figure
fig = change_sim_obj.plot_continous_variable_over_time(cv_plot_df, death_plot_df)

cox_res_high = high_sim_obj.run_cox_timevary_cox_propotional_hazard(cv_data_high, death_data_high)
cox_res_high = pd.DataFrame(cox_res_high.summary)
cox_res_high.insert(0, 'sim_name', 'High Dependence')
cox_res_low  = high_sim_obj.run_cox_timevary_cox_propotional_hazard(cv_data_low, death_data_low)
cox_res_low = pd.DataFrame(cox_res_low.summary)
cox_res_low.insert(0, 'sim_name', 'Low Dependence')
res_df = pd.concat([cox_res_high, cox_res_low])
display(pd.DataFrame(res_df))

# change figure dimensions
fig.update_layout(
    height=1000,
)
pio.renderers.default = 'notebook'
fig
/Users/kevinanderson/Projects/az_takehome/problem_2/simulation.py:271: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

sim_name coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% cmp to z p -log2(p)
covariate
continuous_measure High Dependence -0.094442 0.909881 0.007031 -0.108223 -0.080661 0.897428 0.922506 0.0 -13.431775 3.938584e-41 134.221375
continuous_measure Low Dependence 0.003246 1.003251 0.009348 -0.015076 0.021567 0.985037 1.021801 0.0 0.347206 7.284369e-01 0.457124
In [ ]: